In [ ]:
from osgeo import gdal
import pandas as pd
import geopandas as gp
from matplotlib import pyplot as plt
from matplotlib import colors
from shapely.geometry import Point
import py7zr
1. Import files from Google Drive and extract them into data folder¶
In [ ]:
import gdown
# Google drive public shared folder ID
id = "1vdm8b0ewDslPVvubA8emwDYHUGtwx5pT"
gdown.download_folder(id=id,output="../data", quiet=True, use_cookies=False)
Access denied with the following error:
Cannot retrieve the public link of the file. You may need to change the permission to 'Anyone with the link', or have had many accesses. You may still be able to access the file from the browser: https://drive.google.com/uc?id=1GZNvNuEBCOMfmtZ-t3ppkgbSUJKtWwOH
In [ ]:
import py7zr
# Extract files from 7zip file
with py7zr.SevenZipFile('../data/bike_data.7z', mode='r') as z:
z.extractall("../data")
2. Read files¶
In [ ]:
df = pd.read_csv('../data/accidentsVelo.csv')
df.head()
/tmp/ipykernel_13264/1121051140.py:1: DtypeWarning: Columns (8,9,20,21,30) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv('../data/accidentsVelo.csv')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[68], line 2 1 df = pd.read_csv('../data/accidentsVelo.csv') ----> 2 display(Markdown(df.head())) File ~/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/IPython/core/display.py:328, in DisplayObject.__init__(self, data, url, filename, metadata) 325 self.metadata = {} 327 self.reload() --> 328 self._check_data() File ~/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/IPython/core/display.py:407, in TextDisplayObject._check_data(self) 405 def _check_data(self): 406 if self.data is not None and not isinstance(self.data, str): --> 407 raise TypeError("%s expects text, not %r" % (self.__class__.__name__, self.data)) TypeError: Markdown expects text, not Num_Acc date an mois jour hrmn dep com \ 0 200500000030 2005-01-13 2005 janvier jeudi 19:45 62 62331 1 200500000034 2005-01-19 2005 janvier mercredi 10:45 62 62022 2 200500000078 2005-01-26 2005 janvier mercredi 13:15 02 02173 3 200500000093 2005-01-03 2005 janvier lundi 13:30 02 02810 4 200500000170 2005-01-29 2005 janvier samedi 18:30 76 76196 lat long ... secuexist equipement obs obsm choc manv \ 0 50.3 2.84 ... 0 0 0.0 2.0 8.0 11.0 1 0.0 0.0 ... 0 0 0.0 2.0 1.0 1.0 2 0.0 0.0 ... 1 2 0.0 2.0 1.0 1.0 3 49.255 3.094 ... 0 0 0.0 2.0 3.0 21.0 4 0.0 0.0 ... 1 9 0.0 2.0 4.0 2.0 vehiculeid typevehicules manoeuvehicules numVehicules 0 200500000030B02 18 17 1.0 1 200500000034B02 10 15 1.0 2 200500000078B02 7 15 1.0 3 200500000093B02 7 21 1.0 4 200500000170A01 10 2 1.0 [5 rows x 39 columns]
Removing NA's on lat and long long columns only¶
In [ ]:
num_rows = len(df)
df = df.loc[(df["lat"].notna() & df["long"].notna())]
num_rows_not_na = len(df)
print("Removed NA's rows : %d" % (num_rows -num_rows_not_na) )
Removed NA's rows : 268
Check column types and convert if necessary¶
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 74490 entries, 0 to 74757 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Num_Acc 74490 non-null int64 1 date 74490 non-null object 2 an 74490 non-null int64 3 mois 74490 non-null object 4 jour 74490 non-null object 5 hrmn 74490 non-null object 6 dep 74490 non-null object 7 com 74490 non-null object 8 lat 74490 non-null object 9 long 74490 non-null object 10 agg 74490 non-null int64 11 int 74490 non-null int64 12 col 74486 non-null float64 13 lum 74490 non-null int64 14 atm 74488 non-null float64 15 catr 74490 non-null int64 16 circ 74347 non-null float64 17 nbv 74293 non-null float64 18 prof 74314 non-null float64 19 plan 74291 non-null float64 20 lartpc 63519 non-null object 21 larrout 69399 non-null object 22 surf 74317 non-null float64 23 infra 73955 non-null float64 24 situ 73997 non-null float64 25 grav 74490 non-null int64 26 sexe 74490 non-null int64 27 age 74464 non-null float64 28 trajet 74487 non-null float64 29 secuexist 74490 non-null int64 30 equipement 70749 non-null object 31 obs 74464 non-null float64 32 obsm 74449 non-null float64 33 choc 74478 non-null float64 34 manv 74474 non-null float64 35 vehiculeid 74490 non-null object 36 typevehicules 64223 non-null object 37 manoeuvehicules 64206 non-null object 38 numVehicules 64223 non-null float64 dtypes: float64(16), int64(9), object(14) memory usage: 22.7+ MB
Convert lat and long cols to float¶
In [ ]:
df['lat'] = df['lat'].apply(lambda x: float(str(x).replace(',', '.')))
df['long'] = df['long'].apply(lambda x: float(str(x).replace(',', '.')))
In [ ]:
print("Lat and long colum type are |{}| and |{}| ".format(df.dtypes['lat'],df.dtypes['long'] ))
Lat and long colum type are |float64| and |float64|
Checking zeros on lat and long cols and total number of cols¶
In [ ]:
count_lat = df['lat'].value_counts()[0]
count_long = df['long'].value_counts()[0]
rows = len(df)
results = pd.DataFrame(data={"lat_with_zero":[count_lat], "long_with_zero":[count_long],"Total number of rows": [rows]})
results
Out[ ]:
| lat_with_zero | long_with_zero | Total number of rows | |
|---|---|---|---|
| 0 | 42084 | 43131 | 74490 |
Remove rows where lat and long columns are not 0¶
In [ ]:
df_filtered = df.loc[(df['lat'] != 0) & (df['long'] != 0)].copy()
print("Non zero coordinates data frame has %d rows" %(len(df_filtered)))
Non zero coordinates data frame has 31355 rows
Convert to a proper geodataframe¶
In [ ]:
# Convert longitude and latitude to a geometric Point object
points_gdf = gp.GeoDataFrame(df_filtered, geometry=gp.points_from_xy(df_filtered.lat, df_filtered.long))
# Convert DataFrame to GeoDataFrame
points_gdf = points_gdf.set_crs('epsg:4326')
points_gdf.plot(aspect='equal')
print(points_gdf.crs)
epsg:4326
Its seems the coordinates are inverted, let's fix this¶
In [ ]:
# Convert longitude and latitude to a geometric Point object
points_gdf = gp.GeoDataFrame(df_filtered, geometry=gp.points_from_xy(df_filtered.long, df_filtered.lat))
# Convert DataFrame to GeoDataFrame
points_gdf = points_gdf.set_crs('epsg:4326').to_crs('epsg:27561')
points_gdf.plot()
print(points_gdf.crs)
epsg:27561
Check columns¶
In [ ]:
points_gdf.head()
Out[ ]:
| Num_Acc | date | an | mois | jour | hrmn | dep | com | lat | long | ... | equipement | obs | obsm | choc | manv | vehiculeid | typevehicules | manoeuvehicules | numVehicules | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200500000030 | 2005-01-13 | 2005 | janvier | jeudi | 19:45 | 62 | 62331 | 50.300 | 2.840 | ... | 0 | 0.0 | 2.0 | 8.0 | 11.0 | 200500000030B02 | 18 | 17 | 1.0 | POINT (635873.910 289101.859) |
| 3 | 200500000093 | 2005-01-03 | 2005 | janvier | lundi | 13:30 | 02 | 02810 | 49.255 | 3.094 | ... | 0 | 0.0 | 2.0 | 3.0 | 21.0 | 200500000093B02 | 7 | 21 | 1.0 | POINT (655137.321 173039.590) |
| 6 | 200500000260 | 2005-01-04 | 2005 | janvier | mardi | 16:45 | 22 | 22143 | 48.590 | -2.300 | ... | 0 | 0.0 | 2.0 | 1.0 | 5.0 | 200500000260B02 | 7 | 1 | 1.0 | POINT (258175.124 109337.357) |
| 10 | 200500000291 | 2005-01-24 | 2005 | janvier | lundi | 17:30 | 56 | 56155 | 47.485 | -2.467 | ... | 2 | 0.0 | 2.0 | 3.0 | 15.0 | 200500000291B02 | 7 | 1 | 1.0 | POINT (238049.201 -12534.885) |
| 14 | 200500000329 | 2005-01-04 | 2005 | janvier | mardi | 19:00 | 50 | 50213 | 49.086 | -1.092 | ... | 0 | 0.0 | 2.0 | 4.0 | 13.0 | 200500000329B02 | 7 | 2 | 1.0 | POINT (349671.489 159668.389) |
5 rows × 40 columns
Plot France department limits and bike accident points¶
In [ ]:
france_gdf = gp.read_file("../data/france_dep.geojson")
france_gdf = france_gdf.to_crs('epsg:27561')
ax = points_gdf.plot(marker='o', color='red', markersize=5,aspect=1)
france_gdf.plot(ax=ax, color='white', edgecolor='black')
plt.show()
Let's grab only the points inside France and plot bicycle accident points together¶
In [ ]:
points_within = gp.sjoin(points_gdf,france_gdf,predicate='within')
In [ ]:
ax = points_within.plot(legend=True, markersize=3,alpha=0.5,figsize=(12,12))
france_gdf.plot(ax=ax, color='white', alpha=0.5, edgecolor="k")
plt.show()
Checking if the gravity of the accident can be spatially explained¶
In [ ]:
fig, axs = plt.subplots(2, 2)
points_within_grav_1 = points_within.loc[points_within['grav'] == 1]
points_within_grav_1.plot(ax=axs[0, 0],markersize=3,alpha=0.5,color='green')
axs[0, 0].set_title("Low")
points_within_grav_2 = points_within.loc[points_within['grav'] == 2]
points_within_grav_2.plot(ax=axs[0, 1],markersize=3,alpha=0.5,color='yellow')
axs[0, 1].set_title("Mild")
axs[1, 0].sharex(axs[0, 0])
points_within_grav_3 = points_within.loc[points_within['grav'] == 3]
points_within_grav_3.plot(ax=axs[1, 0],markersize=3,alpha=0.5,color='orange')
axs[1, 0].set_title("High")
points_within_grav_4 = points_within.loc[points_within['grav'] == 4]
points_within_grav_4.plot(ax=axs[1, 1],markersize=3,alpha=0.5,color='red')
axs[1, 1].set_title("Very high")
fig.tight_layout()
It seems that the accidents of types are well distributed across the country and concentrated on major cities. Also, it may indicate that the gravity of an accident is subjective.¶
Lets check the density of accidents (regardless of it's gravity) by region population¶
In [ ]:
accidents_by_department = points_gdf.sjoin(france_gdf,predicate='within')
accidents_by_department.plot()
Out[ ]:
<Axes: >
Plot accidents by gravity and by department¶
In [ ]:
accidents_by_region = accidents_by_department.dissolve(by='nom', aggfunc='sum',numeric_only=False)
# Remove index field names
france_gdf=france_gdf.loc[:,~france_gdf.columns.str.startswith('index_')]
accidents_by_region=accidents_by_region.loc[:,~accidents_by_region.columns.str.startswith('index_')]
departments_by_accidents = gp.sjoin(france_gdf,accidents_by_region,how="inner",predicate="intersects")
cmap = colors.LinearSegmentedColormap.from_list("", ["green","yellow","orange","red"])
departments_by_accidents.plot(column = 'grav', scheme='quantiles', cmap=cmap,figsize=(10,10),legend=True);
plt.title("Number of accidents by gravity and department")
plt.show()
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/geopandas/geodataframe.py:1780: FutureWarning: Dropping invalid columns in DataFrameGroupBy.sum is deprecated. In a future version, a TypeError will be raised. Before calling .sum, select only columns which should be valid for the function. aggregated_data = data.groupby(**groupby_kwargs).agg(aggfunc, **kwargs)
In [ ]:
departments_by_accidents[["nom","grav"]].sort_values("grav",ascending=False)
Out[ ]:
| nom | grav | |
|---|---|---|
| 36 | Paris | 17513 |
| 82 | Maine-et-Loire | 3482 |
| 5 | Ille-et-Vilaine | 3112 |
| 33 | Pyrénées-Atlantiques | 2461 |
| 54 | Gironde | 2376 |
| ... | ... | ... |
| 81 | Lot | 158 |
| 48 | Corse-du-Sud | 151 |
| 30 | Lozère | 118 |
| 95 | Territoire de Belfort | 113 |
| 50 | Creuse | 76 |
96 rows × 2 columns
By using a quantilles representation, we notice that touristic zones have a high number of accidents. But Paris, on absolute numbers, is still way higher than other departments¶
Removing spaces and upper cases from cols of both dataframes¶
In [ ]:
accidents_by_region.columns = [x.lower().replace(' ','') for x in accidents_by_region.columns]
accidents_by_department.columns = [x.lower().replace(' ','') for x in accidents_by_department.columns]
Calculate the occurences of accidents by region and add this info to the geodataframe with polygons¶
In [ ]:
num_accidents_per_region = pd.DataFrame({'total':accidents_by_department['nom'].value_counts()}).reset_index().rename(columns={'index': 'nom'})
num_accidents_per_region
Out[ ]:
| nom | total | |
|---|---|---|
| 0 | Paris | 5001 |
| 1 | Maine-et-Loire | 994 |
| 2 | Ille-et-Vilaine | 892 |
| 3 | Gironde | 704 |
| 4 | Pyrénées-Atlantiques | 703 |
| ... | ... | ... |
| 91 | Lot | 56 |
| 92 | Corse-du-Sud | 49 |
| 93 | Territoire de Belfort | 38 |
| 94 | Lozère | 37 |
| 95 | Creuse | 24 |
96 rows × 2 columns
In [ ]:
# Merge
accidents_by_region_and_name = accidents_by_department.merge(num_accidents_per_region, on='nom')
region_by_accidents = france_gdf.merge(num_accidents_per_region, on='nom')
In [ ]:
accidents_by_region_and_name.plot(column='grav')
Out[ ]:
<Axes: >
In [ ]:
region_by_accidents.plot(column='total',cmap=cmap,scheme='equalinterval',legend=True,figsize=(10,10))
#boxplot', 'equalinterval', 'fisherjenks', 'fisherjenkssampled', 'headtailbreaks', 'jenkscaspall', 'jenkscaspallforced',
#'jenkscaspallsampled', 'maxp', 'maximumbreaks', 'naturalbreaks', 'quantiles', 'percentiles', 'prettybreaks', 'stdmean', 'userdefined'
Out[ ]:
<Axes: >
By using equal intervals, the total number of accidents in Paris is way higher than in most regions of france. Let's make the same plot by using region population instead¶
In [ ]:
population = pd.read_excel('../data/TCRD_004.xlsx',index_col=[0])
population_filtered = population[['nom','2023 (p)']].copy().rename(columns = {'2023 (p)':'pop_2023'})
population_filtered
Out[ ]:
| nom | pop_2023 | |
|---|---|---|
| 01 | Ain | 671937 |
| 02 | Aisne | 522791 |
| 03 | Allier | 332443 |
| 04 | Alpes-de-Haute-Provence | 166654 |
| 05 | Hautes-Alpes | 139942 |
| ... | ... | ... |
| 91 | Essonne | 1316053 |
| 92 | Hauts-de-Seine | 1642002 |
| 93 | Seine-Saint-Denis | 1682806 |
| 94 | Val-de-Marne | 1426748 |
| 95 | Val-d'Oise | 1274374 |
96 rows × 2 columns
In [ ]:
region_by_accidents_pop2023 = region_by_accidents.merge(population_filtered,on='nom')
region_by_accidents_pop2023['acc_per_hab'] = (region_by_accidents_pop2023['total'] / region_by_accidents_pop2023['pop_2023']) * 1000
In [ ]:
region_by_accidents_pop2023.sort_values(by=['acc_per_hab'],ascending=[False]).head(10)
Out[ ]:
| code | nom | geometry | total | pop_2023 | acc_per_hab | |
|---|---|---|---|---|---|---|
| 36 | 75 | Paris | POLYGON ((598781.702 133337.696, 603566.179 13... | 5001 | 2102650 | 2.378427 |
| 21 | 05 | Hautes-Alpes | POLYGON ((909410.352 -278534.727, 910653.954 -... | 170 | 139942 | 1.214789 |
| 82 | 49 | Maine-et-Loire | POLYGON ((331522.762 14753.942, 332270.240 184... | 994 | 828269 | 1.200093 |
| 60 | 65 | Hautes-Pyrénées | MULTIPOLYGON (((400774.858 -493563.673, 399221... | 264 | 230583 | 1.144924 |
| 33 | 64 | Pyrénées-Atlantiques | POLYGON ((390674.278 -454989.449, 391301.192 -... | 703 | 697899 | 1.007309 |
| 55 | 36 | Indre | POLYGON ((523410.499 -56802.253, 524321.781 -5... | 193 | 214914 | 0.898034 |
| 16 | 68 | Haut-Rhin | POLYGON ((960414.264 79357.239, 962829.090 792... | 653 | 769231 | 0.848900 |
| 84 | 56 | Morbihan | MULTIPOLYGON (((205667.370 -26249.380, 206759.... | 641 | 777383 | 0.824561 |
| 25 | 17 | Charente-Maritime | MULTIPOLYGON (((305094.968 -158476.033, 306205... | 547 | 665904 | 0.821440 |
| 5 | 35 | Ille-et-Vilaine | MULTIPOLYGON (((279606.174 105397.241, 276566.... | 892 | 1118600 | 0.797425 |
In [ ]:
region_by_accidents_pop2023.plot(column='acc_per_hab',scheme='equalinterval',k=3,cmap=cmap,legend=True,figsize=(10,10))
Out[ ]:
<Axes: >
Maybe the cyclable rods can provide more info¶
In [ ]:
pistes_cyclable = gp.read_file("../data/france-20230901.geojson")
pistes_cyclable = pistes_cyclable.to_crs('epsg:27561')
pistes_cyclable.plot()
Out[ ]:
<Axes: >
In [ ]:
pistes_cyclable.head(1)
Out[ ]:
| id_local | id_osm | num_iti | code_com_d | ame_d | regime_d | sens_d | largeur_d | local_d | statut_d | ... | revet_g | access_ame | date_maj | trafic_vit | lumiere | d_service | source | project_c | ref_geo | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | geovelo_967104105_65226 | 967104105 | NaN | 65226 | VOIE VERTE | AUTRE | UNIDIRECTIONNEL | NaN | NaN | EN SERVICE | ... | NaN | NaN | 2021-07-25 | 5.0 | NaN | NaN | Les contributeurs OpenStreetmap | 4326 | OpenStreetmap | LINESTRING (411941.578 -496613.093, 411959.247... |
1 rows × 28 columns
Check some colums and unique values¶
In [ ]:
ame = pd.Series(u for u in pistes_cyclable['ame_d'].unique())
sens_d = pd.Series(u for u in pistes_cyclable['sens_d'].unique())
revet_g = pd.Series(u for u in pistes_cyclable['revet_g'].unique())
access_ame = pd.Series(u for u in pistes_cyclable['access_ame'].unique())
lumiere = pd.Series(u for u in pistes_cyclable['lumiere'].unique())
In [ ]:
df_unique = pd.DataFrame({'track_type': ame, 'track_direction':sens_d,'track_material':revet_g,'track_acess':access_ame,'lumiere':lumiere})
df_unique
Out[ ]:
| track_type | track_direction | track_material | track_acess | lumiere | |
|---|---|---|---|---|---|
| 0 | VOIE VERTE | UNIDIRECTIONNEL | NaN | NaN | NaN |
| 1 | AUTRE | BIDIRECTIONNEL | LISSE | VELO DE ROUTE | True |
| 2 | AUCUN | NaN | RUGUEUX | ROLLER | False |
| 3 | AMENAGEMENT MIXTE PIETON VELO HORS VOIE VERTE | NaN | MEUBLE | VTC | NaN |
| 4 | BANDE CYCLABLE | NaN | NaN | VTT | NaN |
| 5 | PISTE CYCLABLE | NaN | NaN | NaN | NaN |
| 6 | COULOIR BUS+VELO | NaN | NaN | NaN | NaN |
| 7 | GOULOTTE | NaN | NaN | NaN | NaN |
| 8 | CHAUSSEE A VOIE CENTRALE BANALISEE | NaN | NaN | NaN | NaN |
| 9 | ACCOTEMENT REVETU HORS CVCB | NaN | NaN | NaN | NaN |
| 10 | DOUBLE SENS CYCLABLE BANDE | NaN | NaN | NaN | NaN |
| 11 | VELO RUE | NaN | NaN | NaN | NaN |
| 12 | DOUBLE SENS CYCLABLE PISTE | NaN | NaN | NaN | NaN |
| 13 | NaN | NaN | NaN | NaN | NaN |
Plot proportions of NA for each col¶
In [ ]:
null_track_type_freq = pistes_cyclable['ame_d'].isnull().sum();
not_null_track_type_freq = pistes_cyclable['ame_d'].notnull().sum();
null_track_direction_freq = pistes_cyclable['sens_d'].isnull().sum();
not_null_track_direction_freq = pistes_cyclable['sens_d'].notnull().sum();
null_track_material_freq = pistes_cyclable['revet_g'].isnull().sum();
not_null_track_material_freq = pistes_cyclable['revet_g'].notnull().sum();
null_track_acess_freq = pistes_cyclable['access_ame'].isnull().sum();
not_null_track_acess_freq = pistes_cyclable['access_ame'].notnull().sum();
null_lumiere_freq = pistes_cyclable['lumiere'].isnull().sum();
not_null_lumiere_freq = pistes_cyclable['lumiere'].notnull().sum();
stats = pd.DataFrame({'Col' : ['Track type','Track dir','Track material','Track acess','Lumiere'],
'NA': [null_track_type_freq,null_track_direction_freq,null_track_material_freq,null_track_acess_freq,null_lumiere_freq],
'Not NA': [not_null_track_type_freq,not_null_track_direction_freq,not_null_track_material_freq,not_null_track_acess_freq,not_null_lumiere_freq]})
stats.set_index('Col')
stats
Out[ ]:
| Col | NA | Not NA | |
|---|---|---|---|
| 0 | Track type | 13 | 293803 |
| 1 | Track dir | 61 | 293755 |
| 2 | Track material | 110912 | 182904 |
| 3 | Track acess | 260307 | 33509 |
| 4 | Lumiere | 223554 | 70262 |
Since there are too many null values regarding track material, acess type and presence of ligtht, let's explore track type and track direction¶
In [ ]:
# Starting by track type
pistes_cyclable_type_not_na = pistes_cyclable.loc[pistes_cyclable['ame_d'].notnull()]
print("Number of track types with null values : ", pistes_cyclable_type_not_na['ame_d'].isnull().sum())
Number of track types with null values : 0
Visuzalize the top 5 departments by accidents and extract the first one¶
In [ ]:
top5 = region_by_accidents_pop2023.sort_values(by=['acc_per_hab'],ascending=[False]).head(5)
print(top5[['nom','pop_2023']])
paris = top5.head(1)
paris
nom pop_2023 36 Paris 2102650 21 Hautes-Alpes 139942 82 Maine-et-Loire 828269 60 Hautes-Pyrénées 230583 33 Pyrénées-Atlantiques 697899
Out[ ]:
| code | nom | geometry | total | pop_2023 | acc_per_hab | |
|---|---|---|---|---|---|---|
| 36 | 75 | Paris | POLYGON ((598781.702 133337.696, 603566.179 13... | 5001 | 2102650 | 2.378427 |
In [ ]:
ax = paris.plot(color="white", edgecolor="black", figsize=(20, 10))
# Drop cols with index_ suffix
accidents_by_region_and_name=accidents_by_region_and_name.loc[:,~accidents_by_region_and_name.columns.str.startswith('index_')]
# Intersect tracks with paris
region_intersect_track_type = gp.sjoin(pistes_cyclable_type_not_na, paris, predicate='intersects')
# Intersect accidents with paris
accidents_intersect_type = gp.sjoin(accidents_by_region_and_name, paris, predicate='within')
# Define a colormap
region_intersect_cmap = colors.LinearSegmentedColormap.from_list(region_intersect_track_type["ame_d"].unique(), list(reversed(["green","yellow","orange","red"])))
region_intersect_track_type.plot(ax=ax,cmap='turbo',column='ame_d',legend=True)
accidents_intersect_type.plot(ax=ax,color="blue",legend=True,alpha=0.5)
Out[ ]:
<Axes: >
There doesnt seem to have a direct relation between accidents and track type. Lets see about the track direction¶
In [ ]:
# Starting by track type
pistes_cyclable_direction_not_na = pistes_cyclable.loc[pistes_cyclable['ame_d'].notnull()]
print("Number of track types with null values : ", pistes_cyclable_direction_not_na['ame_d'].isnull().sum())
Number of track types with null values : 0
In [ ]:
# Intersect tracks with paris
ax = paris.plot(color="white", edgecolor="black",alpha=0.5, figsize=(20, 10))
region_intersect_track_direction = gp.sjoin(pistes_cyclable_direction_not_na, paris, predicate='intersects')
# Intersect accidents with paris
accidents_intersect_track_direction = gp.sjoin(accidents_by_region_and_name, paris, predicate='within')
# Define a colormap
region_intersect_cmap = colors.LinearSegmentedColormap.from_list(region_intersect_track_direction["sens_d"].unique(), list(reversed(["green","red"])))
region_intersect_track_direction.plot(ax=ax,column='sens_d',cmap=region_intersect_cmap,legend=True)
accidents_intersect_track_direction.plot(ax=ax,column="age",color='blue',legend=True,alpha=0.5)
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/geopandas/plotting.py:658: UserWarning: Only specify one of 'column' or 'color'. Using 'color'. warnings.warn(
Out[ ]:
<Axes: >
Accidents by period of the year¶
In [ ]:
months = list(accidents_intersect_track_direction['mois'].unique()).sort
months_order = ['janvier', 'février', 'mars', 'avril',
'mai', 'juin', 'juillet', 'août',
'septembre', 'octobre', 'novembre', 'décembre']
In [ ]:
accidents_intersect_track_direction_grouped = accidents_intersect_track_direction.sort_values('mois', ascending=True).groupby('mois').size().reset_index(name ='Accidents')
accidents_intersect_track_direction_sorted_grouped = accidents_intersect_track_direction_grouped.sort_values('mois', key=lambda s: s.apply(months_order.index), ignore_index=True)
accidents_intersect_track_direction_sorted_grouped
Out[ ]:
| mois | Accidents | |
|---|---|---|
| 0 | janvier | 359 |
| 1 | février | 319 |
| 2 | mars | 409 |
| 3 | avril | 391 |
| 4 | mai | 452 |
| 5 | juin | 623 |
| 6 | juillet | 519 |
| 7 | août | 295 |
| 8 | septembre | 507 |
| 9 | octobre | 449 |
| 10 | novembre | 360 |
| 11 | décembre | 318 |
In [ ]:
import seaborn as sns
sns.set_theme()
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=accidents_intersect_track_direction_sorted_grouped, x="mois",y='Accidents').set_title("Accidents per month in paris")
ax.tick_params(axis='x', labelrotation = 45)
plt.show()
It's cleaar that population has a hight weight on the bicycle , since during the month June and July, we have a incresed number of tourists in Paris. The month of September, is the first after vaccation in France¶
Let's make cloroplets, using arrondissement zones in paris¶
In [ ]:
import geoplot as gplt
# Read polygons of arrondissement
arrondis_gdf = gp.read_file("../data/arrondissements.geojson")
arrondis_gdf = arrondis_gdf.to_crs('epsg:27561')
# Drop cols with index_ suffix
#paris_accidents = paris_accidents.loc[:,~paris_accidents.columns.str.startswith('index_')]
# Extract only accidents in paris (by name)
paris_accidents = accidents_by_department.loc[accidents_by_department['nom'] == 'Paris']
# Merge paris accidents with population data and remove cols with index_ (from merge result)
paris_accidents_by_pop2023 = paris_accidents.merge(population_filtered,on='nom')
paris_accidents_by_pop2023 = paris_accidents_by_pop2023.loc[:,~paris_accidents_by_pop2023.columns.str.startswith('index_')]
# Spatial join between arrondissement (polygon) and accidents in paris (points), using 'intersects predicate
paris_arrondis_accidents_by_pop2023 = gp.sjoin(arrondis_gdf, paris_accidents_by_pop2023, predicate='intersects')
# Group by arronsidement names and merge with popsulation
population_by_arrondissement = paris_arrondis_accidents_by_pop2023.groupby('l_ar').size().reset_index(name='num_accidents')
paris_accidents_by_pop2023_arrondi = paris_arrondis_accidents_by_pop2023.merge(population_by_arrondissement,on='l_ar')
# Ploting result
ax = arrondis_gdf.plot(color="white",edgecolor="black",alpha=0.5, figsize=(20, 12))
paris_accidents_by_pop2023_arrondi.plot(ax=ax,column='num_accidents',cmap=region_intersect_cmap.reversed(),legend=True)
# Plot arrondissement labels based on polygons centroids
paris_accidents_by_pop2023_arrondi.apply(lambda x: ax.annotate(text= x['l_ar'].split(' ')[0], xy=x.geometry.centroid.coords[0], ha='center'), axis=1);
ax.set_title('Number of accidents by arrondissement', fontsize=13);
And plot data¶
In [ ]:
# Order by accident
population_by_arrondissement_ordered = population_by_arrondissement.sort_values('num_accidents')
# Split X axis labels to extract first word
labels = list(population_by_arrondissement_ordered['l_ar'].apply(lambda x: x.split(' ')[0]))
# Define an axis with plot and its parameters
ax = population_by_arrondissement.sort_values('num_accidents').plot(kind='bar',x='l_ar',
xlabel="Arrondisement",
ylabel="Number of accidents")
ax.set_xticklabels(labels,rotation=45)
# Plot
plt.show()
They are not so representative, since we cant see where are the zones where accidents occur more ofter. Lets make a grid. A greate function can be found here : https://pygis.io/docs/e_summarize_vector.html¶
In [ ]:
import math
import numpy as np
from shapely.geometry import Polygon, box
def create_grid(feature, shape, side_length):
'''Create a grid consisting of either rectangles or hexagons with a specified side length that covers the extent of input feature.'''
# Slightly displace the minimum and maximum values of the feature extent by creating a buffer
# This decreases likelihood that a feature will fall directly on a cell boundary (in between two cells)
# Buffer is projection dependent (due to units)
feature = feature.buffer(20)
# Get extent of buffered input feature
min_x, min_y, max_x, max_y = feature.total_bounds
# Shape area
area = 0
# Create empty list to hold individual cells that will make up the grid
cells_list = []
# Create grid of squares if specified
if shape in ["square", "rectangle", "box"]:
# Adapted from https://james-brennan.github.io/posts/fast_gridding_geopandas/
# Create and iterate through list of x values that will define column positions with specified side length
for x in np.arange(min_x - side_length, max_x + side_length, side_length):
# Create and iterate through list of y values that will define row positions with specified side length
for y in np.arange(min_y - side_length, max_y + side_length, side_length):
# Create a box with specified side length and append to list
cells_list.append(box(x, y, x + side_length, y + side_length))
est = (max_x - min_x) / length(cells_list)
north = (max_y - min_y) / length(cells_list)
area = (est * north)
# Otherwise, create grid of hexagons
elif shape == "hexagon":
# Set horizontal displacement that will define column positions with specified side length (based on normal hexagon)
x_step = 1.5 * side_length
# Set vertical displacement that will define row positions with specified side length (based on normal hexagon)
# This is the distance between the centers of two hexagons stacked on top of each other (vertically)
y_step = math.sqrt(3) * side_length
# Get apothem (distance between center and midpoint of a side, based on normal hexagon)
apothem = (math.sqrt(3) * side_length / 2)
# Set column number
column_number = 0
# Create and iterate through list of x values that will define column positions with vertical displacement
for x in np.arange(min_x, max_x + x_step, x_step):
# Create and iterate through list of y values that will define column positions with horizontal displacement
for y in np.arange(min_y, max_y + y_step, y_step):
# Create hexagon with specified side length
hexagon = [[x + math.cos(math.radians(angle)) * side_length, y + math.sin(math.radians(angle)) * side_length] for angle in range(0, 360, 60)]
# Append hexagon to list
cells_list.append(Polygon(hexagon))
# Check if column number is even
if column_number % 2 == 0:
# If even, expand minimum and maximum y values by apothem value to vertically displace next row
# Expand values so as to not miss any features near the feature extent
min_y -= apothem
max_y += apothem
# Else, odd
else:
# Revert minimum and maximum y values back to original
min_y += apothem
max_y -= apothem
# Increase column number by 1
column_number += 1
area = (3 * math.sqrt(3) * pow(side_length,2)) / 2
# Else, raise error
else:
raise Exception("Specify a rectangle or hexagon as the grid shape.")
# Create grid from list of cells
grid = gp.GeoDataFrame(cells_list, columns = ['geometry'], crs = "epsg:27561")
# Create a column that assigns each grid a number
grid["Grid_ID"] = np.arange(len(grid))
# Return grid
return grid,area
Create hexagons¶
In [ ]:
# Create heaxagon
area_grid,area = create_grid(feature = paris_arrondis_accidents_by_pop2023, shape = 'hexagon', side_length = 200)
#cell_grid["cell_id"] = cell_grid.index + 1
#cell_grid.head(5)
area_grid.plot()
Out[ ]:
<Axes: >
Remove cells outside Paris¶
In [ ]:
area_grid_paris = gp.sjoin(area_grid, arrondis_gdf, how='inner', predicate='intersects')
# Remove fields from spatial join
area_grid_paris = area_grid_paris.loc[:,~area_grid_paris.columns.str.startswith('index_')]
area_grid_paris.reset_index(inplace=True)
area_grid_paris.plot()
Out[ ]:
<Axes: >
Later, we fuse the cells with the accidents layer¶
In [ ]:
## Remove index columns from previous merge operations, if necessary
paris_accidents = paris_accidents.loc[:,~paris_accidents.columns.str.startswith('index_')]
paris_accidents_by_pop2023_arrondi = paris_accidents_by_pop2023_arrondi.loc[:,~paris_accidents_by_pop2023_arrondi.columns.str.startswith('index_')]
print("Dataframes have columns with name index ? :", any(paris_accidents.columns.str.startswith('index_'))
and any(paris_accidents_by_pop2023_arrondi.columns.str.startswith('index_')))
Dataframes have columns with name index ? : False
In [ ]:
# Grab all dataset of accidents, within paris
accidents_paris = gp.sjoin(accidents_by_region_and_name, paris_accidents_by_pop2023_arrondi, how='inner', predicate='within')
accidents_paris = accidents_paris.loc[:,~accidents_paris.columns.str.startswith('index_')]
accidents_paris.columns = accidents_paris.columns.str.rstrip("_left")
accidents_paris.columns = accidents_paris.columns.str.rstrip("_right")
accidents_paris.plot()
Out[ ]:
<Axes: >
In [ ]:
accidents_paris.head()
Out[ ]:
| num_acc | da | an | mois | jou | hrmn | dep | com | la | lon | ... | choc | manv | vehiculeid | typevehicules | manoeuvehicules | numvehicules | code | nom | pop_2023 | num_accidents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25055 | 201800041647 | 2018-01-13 | 2018 | janvier | samedi | 21:10 | 94 | 94069 | 48.817260 | 2.458920 | ... | 0.0 | 1.0 | 202100048294A01 | NaN | NaN | NaN | 75 | Paris | 2102650 | 375 |
| 25118 | 201800054676 | 2018-03-06 | 2018 | mars | mardi | 95:0 | 75 | 75120 | 48.846600 | 2.415980 | ... | 0.0 | 1.0 | 202100048294A01 | NaN | NaN | NaN | 75 | Paris | 2102650 | 375 |
| 25122 | 201900035012 | 2019-06-23 | 2019 | juin | dimanche | 08:10 | 75 | 75112 | 48.844343 | 2.441665 | ... | 0.0 | 1.0 | 202100048294A01 | NaN | NaN | NaN | 75 | Paris | 2102650 | 375 |
| 25143 | 202000019957 | 2020-12-19 | 2020 | décembre | samedi | 17:15 | 94 | 94033 | 48.844350 | 2.450700 | ... | 0.0 | 1.0 | 202100048294A01 | NaN | NaN | NaN | 75 | Paris | 2102650 | 375 |
| 25148 | 202100021295 | 2021-09-02 | 2021 | septembre | jeudi | 08:45 | 75 | 75112 | 48.818409 | 2.451584 | ... | 0.0 | 1.0 | 202100048294A01 | NaN | NaN | NaN | 75 | Paris | 2102650 | 375 |
5 rows × 95 columns
Aggreate accidents by cell grid¶
In [ ]:
#######
# Perform spatial join, merging attribute table of wells point and that of the cell with which it intersects
# op = "intersects" also counts those that fall on a cell boundary (between two cells)
# op = "within" will not count those fall on a cell boundary
points_within = points_within.loc[:,~points_within.columns.str.startswith('index_')]
# Merging accidents data with grid data by spatial intersection boundary
grid_accidents = gp.sjoin(points_within, area_grid_paris, how='left', predicate='within')
# Add a field with constant value of 1
grid_accidents['n_acc'] = 1
# Compute stats per grid cell -- aggregate fires to grid cells with dissolve
dissolve = grid_accidents.dissolve(by="index_right", aggfunc="count")
# put this into cell
area_grid_paris.loc[dissolve.index, 'n_acc'] = dissolve.n_acc.values
# Fill the NaN values (cells without any points) with 0 if we want to see
area_grid_paris['n_acc'] = area_grid_paris['n_acc'].fillna(0)
#cell_grid = cell_grid.within(paris_accidents_by_pop2023_arrondi)]
In [ ]:
# Plot data
ax = paris_accidents_by_pop2023_arrondi.plot(markersize=.1, figsize=(15, 10),color="None",edgecolor="red",legend=True)
legend_intervals = [int(area_grid_paris["n_acc"].min()),5,10,15,int(area_grid_paris["n_acc"].max())]
accidents_paris.plot(ax = ax,marker = 'o', color = 'dimgray', markersize = 3)
area_grid_paris.plot(ax = ax, column = "n_acc",
cmap=cmap,edgecolor="lightseagreen", linewidth = 0.5, alpha = 0.8,legend = True,
legend_kwds={
"shrink":.68,
"format": "%g",
'label': "Accidents",
"pad": 0.01,
#"ticks" : legend_intervals
})
# Set title
ax.set_title(f'Grid of accidents per ±{area:.0f} m2', fontdict = {'fontsize': '15', 'fontweight' : '3'})
plt.show()
Lets test some interpolation methods to fill the empty cells and get better results¶
Give a small subset of points to train on¶
In [ ]:
def f(x):
"""Function to be approximated by polynomial interpolation."""
return x * np.sin(x)
In [ ]:
samples = area_grid_paris["n_acc"].to_list()
# whole range we want to plot
x_plot = np.linspace(min(samples), max(samples), len(samples))
# To make it interesting, we only give a small subset of points to train on.
x_train = samples
rng = np.random.RandomState(0)
x_train = np.sort(rng.choice(x_train, size=10, replace=False))
y_train = f(x_train)
# create 2D-array versions of these arrays to feed to transformers
X_train = x_train[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]
In [ ]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
# plot function
lw = 2
fig, ax = plt.subplots()
ax.set_prop_cycle(
color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"]
)
ax.plot(x_plot, f(x_plot), linewidth=lw, label="ground truth")
# plot training points
ax.scatter(x_train, y_train, label="training points")
# polynomial features
for degree in [3, 4, 5]:
model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1e-3))
model.fit(X_train, y_train)
y_plot = model.predict(X_plot)
ax.plot(x_plot, y_plot, label=f"degree {degree}")
# B-spline with 4 + 3 - 1 = 6 basis functions
model = make_pipeline(SplineTransformer(n_knots=4, degree=3), Ridge(alpha=1e-3))
model.fit(X_train, y_train)
y_plot = model.predict(X_plot)
ax.plot(x_plot, y_plot, label="B-spline")
ax.legend(loc="lower center")
ax.set_ylim(-40, 40)
plt.show()
Splines make a good job on fitting well the data, and provide the same time, some paramenters to control extrapolation. However, seasonal effects may cut an expected periodic continuation of the underlying signal. Periodic splines could be used in such case¶
In [ ]:
def g(x):
"""Function to be approximated by periodic spline interpolation."""
return np.sin(x) - 0.7 * np.cos(x * 3)
In [ ]:
y_train = g(x_train)
# Extend the test data into the future:
x_plot_ext = np.linspace(min(samples), max(samples) + 10, len(samples) + 100)
X_plot_ext = x_plot_ext[:, np.newaxis]
lw = 2
fig, ax = plt.subplots()
ax.set_prop_cycle(color=["black", "tomato", "teal"])
ax.plot(x_plot_ext, g(x_plot_ext), linewidth=lw, label="ground truth")
ax.scatter(x_train, y_train, label="training points")
for transformer, label in [
(SplineTransformer(degree=3, n_knots=10), "spline"),
(
SplineTransformer(
degree=3,
knots=np.linspace(0, 2 * np.pi, 10)[:, None],
extrapolation="periodic",
),
"periodic spline",
),
]:
model = make_pipeline(transformer, Ridge(alpha=1e-3))
model.fit(X_train, y_train)
y_plot_ext = model.predict(X_plot_ext)
ax.plot(x_plot_ext, y_plot_ext, label=label)
ax.legend()
fig.show()
/tmp/ipykernel_13264/4214459259.py:30: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show()
In [ ]:
sns.displot(samples,kind="kde")
plt.title("Number of accidents per cell grid in Paris")
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Out[ ]:
Text(0.5, 1.0, 'Number of accidents per cell grid in Paris')
In [ ]: